At DEF CON 22, the FTC ran a contest to help mitigate robocalls. There were three rounds, the last of which was using a set of call records collected from a robocall honeypot to determine if a caller was a robocaller. See Parts I and II of the contest for details on robocaller honeypots.
The FTC gave us two sets of data that show phone calls from one "person" to another, along with the date and time of each call. Both collections have been randomized uniquely, but the area code and subscriber number portions were kept the same.
This notebook is a follow-up to Analyzing Rachel the Robo Caller and details creating a Random Forest classifier to predict robocallers.
In [31]:
# Display the contest banner; the image is fetched from the FTC site at render time.
from IPython.display import Image
Image("http://www.ftc.gov/system/files/attachments/zapping-rachel/zapping-rachel-contest.jpg")
Out[31]:
In [6]:
%matplotlib inline
# Standard toolkits in pydata land
import pandas as pd
import numpy as np
# Exploring the use of a RandomForest
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
In [16]:
def read_FTC(dataset):
    '''Read the CSV format the FTC provided for the Rachel the Robocaller contest.

    Parameters
    ----------
    dataset : str or file-like
        Path to (or buffer of) one of the FTC contest CSV files.

    Returns
    -------
    pandas.DataFrame with DATE/TIME parsed to datetimes, TO/FROM kept as
    strings (preserves leading digits for slicing out area codes), and
    LIKELY ROBOCALL converted to bool ('X' -> True, blank -> False).
    '''
    return pd.read_csv(
        dataset,
        parse_dates=["DATE/TIME"],
        # The FTC marks likely robocalls with an 'X'; empty cell means not flagged.
        converters={'LIKELY ROBOCALL': lambda val: val == 'X'},
        # BUG FIX: 'LIKELY ROBOCALL' was also listed in dtype, which made pandas
        # warn "Both a converter and dtype were specified ... only the converter
        # will be used" -- the dtype entry was dead weight, so drop it.
        dtype={'TO': str, 'FROM': str},
    )
In [34]:
def extract_features(ftc_row):
    '''Enrich one call record (a pandas Series) with derived features.

    Adds HOUR, MINUTE, TO/FROM area codes, TO/FROM office codes, and a
    TIMECHUNK bucketing the call time into quarter-hour slots. Returns the
    same row with the new fields appended.
    '''
    when = ftc_row["DATE/TIME"]
    ftc_row["HOUR"] = when.hour
    ftc_row["MINUTE"] = when.minute
    # All numbers are regular US numbers ("1" + 10 digits), so plain string
    # slicing pulls out the pieces: digits 1-3 are the area code ...
    for direction in ("TO", "FROM"):
        ftc_row[direction + "_AREA_CODE"] = ftc_row[direction][1:4]
    # ... and digits 1-6 are area code + "office code".
    for direction in ("TO", "FROM"):
        ftc_row[direction + "_OFFICE_CODE"] = ftc_row[direction][1:7]
    # Quarter-hour bucket, e.g. 13:47 -> 13.75
    ftc_row["TIMECHUNK"] = when.hour + np.floor(4 * (when.minute / 60.0)) / 4
    return ftc_row
In [35]:
def total_call_volume(df, direction="FROM"):
    '''Return, for every row of *df*, how many rows share that row's phone
    number in the given column ("FROM" or "TO").

    The result is a Series aligned with *df*: each entry is the total call
    count for that row's number.
    '''
    counts_by_number = df.groupby(direction).size()
    # Mapping the column through the per-number counts broadcasts each
    # number's total back onto its rows.
    return df[direction].map(counts_by_number)
In [36]:
def massage_ftc_dataframe(ftc_dataframe):
    '''Enrich an FTC call DataFrame with per-row features plus per-number
    call-volume counts (NUM_FROM_CALLS, NUM_TO_CALLS). Returns a new frame.'''
    # Row-wise feature extraction first, then the volume columns, which need
    # the whole (enriched) frame to count over.
    enriched = ftc_dataframe.apply(extract_features, axis=1)
    for direction in ("FROM", "TO"):
        enriched["NUM_" + direction + "_CALLS"] = total_call_volume(enriched, direction)
    return enriched
In [24]:
# Load the two FTC-provided datasets. Set 1 carries the LIKELY ROBOCALL labels;
# Set 2 is the one we must produce predictions for (see the final cell).
labeled_data = read_FTC("FTC-DEFCON Data Set 1.csv")
unlabeled_data = read_FTC("FTC-DEFCON Data Set 2.csv")
In [41]:
# This assumes you have the data locally
# Enrich the labeled set with the time/area-code/volume features
# (slow: extract_features runs via a row-wise apply).
massaged_labeled_data = massage_ftc_dataframe(labeled_data)
massaged_labeled_data.head()
Out[41]:
In [42]:
# Same enrichment for the unlabeled contest data
massaged_unlabeled_data = massage_ftc_dataframe(unlabeled_data)
massaged_unlabeled_data.head()
Out[42]:
In [43]:
# Eyeball the end of the enriched unlabeled frame as a sanity check
massaged_unlabeled_data.tail()
Out[43]:
In [44]:
# List all columns now available (original + enriched features)
massaged_labeled_data.columns
Out[44]:
In [53]:
# Scoring system for contest
# Not 0-1 loss...
def score(our_predictions, true_results):
    '''Scoring system for the FTC contest. Not 0-1 loss.

    +1 for each correctly flagged robocall (true positive),
    -1 for each wrongly flagged legitimate call (false positive);
    misses and correct negatives score 0.
    '''
    our_score = 0
    # Idiom fix: iterate the pairs directly instead of indexing range(len(...)).
    for predicted, actual in zip(our_predictions, true_results):
        if predicted:
            our_score += 1 if actual else -1
    return our_score
# features is only a copy of the dataframe, can't use this
#def label_encode(features, feature_name):
# feature_encoder = preprocessing.LabelEncoder()
# features[feature_name] = feature_encoder.fit_transform(features[feature_name])
# return feature_encoder
def enriched_data_to_features(enriched_data):
    '''Takes a pandas DataFrame with enriched FTC data, returns features and target labels.

    Categorical columns are integer-encoded with a fresh LabelEncoder per
    column. Returns (features DataFrame, target array); target is None when
    the frame has no 'LIKELY ROBOCALL' column (the unlabeled contest set).
    '''
    categorical_feature_names = [
        "TO_AREA_CODE",
        "FROM_AREA_CODE",
        "TO_OFFICE_CODE",
        "FROM_OFFICE_CODE",
        #"TOTZ",
        #"FROMTZ",
        #"SAMEAREACODE",
        #"WITHIN_THREE_MINUTES",
        #"FROMVALID",
        "TIMECHUNK",
        #"ISWEEKDAY", # Undecided on whether this will generalize since
                      # training and test data have different weekdays
                      # And the labeled data is missing Mondays
    ]
    numerical_feature_names = ["NUM_FROM_CALLS", "NUM_TO_CALLS"]
    feature_names = categorical_feature_names + numerical_feature_names
    # BUG FIX: take an explicit copy -- assigning encoded values into the bare
    # column selection is writing into a view and raises SettingWithCopyWarning
    # (the abandoned label_encode helper above hit the same problem).
    features = enriched_data[feature_names].copy()
    for feature_name in categorical_feature_names:
        print("Creating categorical feature {}".format(feature_name))
        encoder = preprocessing.LabelEncoder()
        features[feature_name] = encoder.fit_transform(features[feature_name])
    # The unlabeled contest set has no label column; callers discard the
    # target in that case, so return None instead of raising KeyError.
    if "LIKELY ROBOCALL" in enriched_data:
        target = enriched_data["LIKELY ROBOCALL"].values
    else:
        target = None
    return features, target
def train(features, target, min_samples_split=285):
    '''Fit a 200-tree random forest on (features, target) and return it.

    Uses a fixed random_state for reproducibility and all cores (n_jobs=-1);
    prints the out-of-bag score as a quick generalization estimate.
    '''
    forest = RandomForestClassifier(
        n_estimators=200,
        verbose=0,
        n_jobs=-1,
        min_samples_split=min_samples_split,
        random_state=1,
        oob_score=True,
    )
    forest.fit(features, target)
    print("Resulting OOB Score: {}".format(forest.oob_score_))
    return forest
In [48]:
# Separate into training and test sets based on FROM, so a given caller never
# appears in both sets (prevents leakage through the FROM-derived features).
# This won't be needed when reading in testing data set;
# for that, train on full data and then use .predict()
from_numbers = massaged_labeled_data["FROM"].unique()
# 70% / 30% split of the unique FROM numbers
num_train = int(round(.7 * len(from_numbers)))
num_test = len(from_numbers) - num_train
# BUG FIX: sample WITHOUT replacement. np.random.choice defaults to
# replace=True, which drew duplicate numbers and left the training set with
# fewer than 70% of the unique callers (and the test set with more than 30%).
train_samples = np.random.choice(from_numbers, num_train, replace=False)
train_data = massaged_labeled_data[massaged_labeled_data['FROM'].isin(train_samples)]
test_data = massaged_labeled_data[~massaged_labeled_data['FROM'].isin(train_samples)]
In [58]:
# For development
# Grid-search min_samples_split: for each candidate value, train on the 70%
# split and record the contest score on both the held-out test split and the
# training split (to watch for overfitting). Results accumulate in score_frame.
print("Enriching Training Data")
train_features, train_target = enriched_data_to_features(train_data)
print("Enriching Testing Data")
test_features, test_target = enriched_data_to_features(test_data)
# Candidate min_samples_split values: 150, 155, ..., 285
min_samples_split_values = np.arange(150, 290, 5)
num_parameter_trials = len(min_samples_split_values)
# create dataframe
score_frame = pd.DataFrame(index=np.arange(0, num_parameter_trials), columns=('min_samples_split', 'test_score', 'train_score') )
for trial in np.arange(0, num_parameter_trials):
    c = min_samples_split_values[trial]
    classifier = train(train_features, train_target, c)
    our_predictions = classifier.predict(test_features)
    our_train_predictions = classifier.predict(train_features)
    # One row per trial: the parameter value and both contest scores
    score_frame.loc[trial] = [c, score(our_predictions, test_target), score(our_train_predictions, train_target) ]
In [59]:
# Inspect the grid-search results (test_score vs train_score per parameter)
score_frame
Out[59]:
In [64]:
# For the sake of the contest now, we'll train on the entire FTC1 dataset and
# then predict on the FTC2 dataset
train_data = massaged_labeled_data
test_data = massaged_unlabeled_data
train_features, train_target = enriched_data_to_features(train_data)
test_features, _ = enriched_data_to_features(test_data)
c = 285 # Determined during contest; not sure of it now that I've run through it again
classifier = train(train_features, train_target, c)
predictions = classifier.predict(test_features)
# .copy() so the column assignments below don't write into a view of
# unlabeled_data (SettingWithCopyWarning).
contest_results = unlabeled_data[["FROM", "TO", "DATE/TIME"]].copy()
contest_results["LIKELY ROBOCALL"] = predictions
# BUG FIX: this line referenced an undefined name `X` (NameError); map over
# contest_results itself to convert booleans to the contest's 'X'/'' format.
contest_results["LIKELY ROBOCALL"] = contest_results["LIKELY ROBOCALL"].map(lambda flag: "X" if flag else "")
contest_results.to_csv("predictions.csv", index=False)
In [65]:
!ls
In [ ]: